*Process AP data
* Globals read in this section but not defined in it: $raw, ${saves}, $data_clean.

* Folder holding the raw AP workbooks. The value ends in "/" and the import
* paths add another "/" after $raw1, so the resulting paths contain "//".
global raw1="$raw/AP/"


***********************************
** AP SCORES **
** This section cleans and
** standardizes AP scores
***********************************

* Import each year's student-level AP workbook and reshape it to one row per
* student (sasid) with one score column per AP subject, named ap_<subject>gr.
* Output: ${saves}/apraw_<year>.dta, with yearap set to the file's year.
foreach y of numlist 2007/2023 {
	* The 2023 export uses a different worksheet name than the earlier years.
	local sheet "Sheet1"
	if `y' == 2023 {
		local sheet "Export Worksheet"
	}

	import excel "$raw1/ap_`y'_student_level_data.xlsx", sheet("`sheet'") firstrow clear
	ren SD_SASID sasid
	keep sasid AP*
	ren AP_SUBJ_CODE ap
	ren AP_SUBJ_SCORE ap_
	* NOTE(review): unless a column is literally named AP, this relies on Stata
	* resolving "AP" as an abbreviation of the one remaining AP* column -- confirm.
	drop AP
	replace ap=lower(ap)
	format sasid %12.0g

	* A student can appear more than once for the same subject within a year.
	* -duplicates drop- keeps the first row of each group, so sort the highest
	* score first (gsort places missing values last in descending order) to
	* make the retained row deterministic and consistent with the later
	* "keep max score" step.
	gsort sasid ap -ap_
	duplicates drop sasid ap, force
	reshape wide ap_, i(sasid) j(ap) string

	ren ap_* ap_*gr

	g yearap=`y'
	duplicates drop
	save "${saves}/apraw_`y'.dta", replace
}
		
		
		********FORMAT DATA****************************
		* For each yearly apraw_<year>.dta file: run the clean-up commands,
		* standardize every score column, and save the result as aps_<year>.dta.
		* Commands prefixed with -cap- are skipped without stopping the script
		* when the variable they need is not present in the file.
		foreach n of numlist 2007/2023 {
			use "${saves}/apraw_`n'.dta", clear
			*Keep only guys taking regular test
			cap drop if restestap!=0

			*Recode year and zip code
			cap gen year=cohortap
			* NOTE(review): if no variable named exactly "year" exists, Stata
			* resolves it as an abbreviation (e.g. of yearap) -- confirm intended.
			destring year, replace
			cap tostring zipap, replace

			*dob
			* dobap is parsed as digits: MDDYY when 5 characters long, MMDDYY when 6.
			cap tostring dobap, replace
			cap gen year_ob=substr(dobap,-2,2)
			cap gen month_ob1=substr(dobap,1,1)
			cap gen month_ob2=substr(dobap,1,2)
			cap gen day_ob1=substr(dobap,2,2)
			cap gen day_ob2=substr(dobap,3,2)
			cap gen length_dob=length(dobap)
			cap gen day_ob=day_ob1 if length_dob==5
			cap replace day_ob=day_ob2 if length_dob==6
			cap gen month_ob=month_ob1 if length_dob==5
			cap replace month_ob=month_ob2 if length_dob==6
			cap drop day_ob1 day_ob2 month_ob1 month_ob2
			* NOTE(review): the century is hard-coded as "19", so a birth year
			* of 2000 or later would be turned into 19YY -- confirm.
			cap gen blah="19"
			cap egen year_ob2=concat(blah year_ob)
			cap drop year_ob
			cap ren year_ob2 year_ob
			cap egen dob=concat(month_ob day_ob year_ob), punct("/")
			cap gen dob_date=date(dob,"MDY")
			cap drop *_ob
			* Remove the parsing helpers. -gen blah- succeeds even when dobap is
			* absent, so without this the constant would be written to every
			* saved file.
			cap drop blah
			cap drop length_dob

			*Center scores by year
			* -center- is not a built-in command (presumably the user-written
			* SSC package); it is expected to create c_<var> for each score.
			foreach x of varlist *gr {
				center `x' if `x'!=., standardize
			}

			*Rename vars
			cap ren lastap lname
			cap ren firstap fname
			cap ren miap mname

			*Keep vars
			*keep year fname mname lname dob ap_* c_* dob_date aprmap
			save "${saves}/aps_`n'.dta", replace
		}
		**********Combine and save**************
		* Stack the yearly aps_<year>.dta files into a single dataset.
		clear
		foreach n of numlist 2007/2023 {
			append using "${saves}/aps_`n'.dta"
		}

		* Tabulate rows per year before and after removing exact duplicate rows.
		* yearap is spelled out rather than abbreviated as "year", so the table
		* cannot bind to a different variable.
		tab yearap
		duplicates drop
		tab yearap

		save "${saves}/ap_combined.dta", replace

				capture rename year yearap

				* The standardized c_* columns are not carried further.
				drop c_*
				capture replace yearap=cohortap if cohortap!=.

				* For every subject, overwrite each row's score with the student's
				* highest score for that subject across all of the student's rows.
				set more off
				foreach score of varlist ap_* {
					tempvar best
					bysort sasid: egen `best' = max(`score')
					replace `score' = `best'
					drop `best'
				}

				*Keep all AP files for more detailed AP analysis (added by Astrid)
				* Every variable except sasid gets the suffix _apdb in this file.
				* NOTE(review): this file is written after the scores were replaced
				* by the per-student maximum, so rows for different years of one
				* student carry the same scores -- confirm that is intended.
				preserve
				rename * *_apdb
				rename sasid_apdb sasid
				save "$data_clean/ap_allyears.dta", replace
				restore
			
			*Keep one observation per kid -- the yearAP represents the year the most recent AP was taken
			* yearap is spelled out so the sort key cannot bind to any other
			* variable whose name starts with "year".
			gsort sasid -yearap
			by sasid:  keep if _n==1

			*Code number of subjects taken and best score across subjects
			* Per-subject took_* indicators are not generated, because the keep
			* below would discard them; n_aps_taken carries the count.
			egen n_aps_taken       = rownonmiss(ap_*)
			egen max_ap_score    = rowmax(ap_*)

			*Save
			keep sasid max_ap_score n_aps_taken ap_*  yearap

			sort sasid

			*Check duplicity
			duplicates report sasid
			*no dups

			save "$data_clean/ap.dta", replace
